#!/usr/bin/env Rscript

"> methepic_emseq_wgbs_per-pos_beta.R <

Reads in betas from MethylationEPIC arrays, EM-seq and WGBS, and compares them 
on a per-position basis.

For short read tech (EM-seq/WGBS), coverage cutoffs are defined as having
at least 3 of 4 samples from the same method having a coverage of 5 and
above (i.e., EM-seq 3 of 4 with coverage >= 5 && WGBS 3 of 4 with coverage >= 5).
MethylationEPIC readouts are 'analogue' so no 'coverage' to deal with.
" -> doc

setwd('~/csiro/stopwatch/cpgberus/14_methepic_vs_emseq_wgbs/')

# colour codes are from Dark2 panel
EMSEQ_COLOR <- '#1b9e77'
EPIC_COLOR <- '#e7298a'
WGBS_COLOR <- '#7570b3'

suppressPackageStartupMessages({
  library(Biostrings)
  library(BSgenome.Hsapiens.UCSC.hg38)
  library(cowplot)
  library(ggplot2)
  library(RColorBrewer)
})

# function to pretty-print diagnostic messages
diag_message <- function(...) {
  message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}

# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
  # modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
  model <- lm(as.formula(paste(y, '~', x)), df)
  eq <- substitute(
    italic(y) == c + m %.% italic(x)*','~italic(r) == rval*','~italic(p) == pval*','~italic(n) == n_df,
    list(c = format(unname(coef(model)[1]), digits=3),
         m = format(unname(coef(model)[2]), digits=3),
         rval = format(cor(df[[x]], df[[y]]), digits=4),
         pval = format(summary(model)$coefficients[2,4], digits=1),
         n_df = format(nrow(df), big.mark=',')))
  as.character(as.expression(eq))
}

# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
  int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
  
  sum(int_vector) / length(int_vector) * 100
}

# load processed EPIC data
load('epic_betas_hg38.RData')

# fix EPIC GRanges--all of the current positions refer to the 'C' on the Watson
# strand. this is correct when strand is '+', but not correct when strand is
# '-' (methylated C on Crick would be the G on Watson)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
##              seqnames    ranges strand |  WR025V1I  WR025V9I  WR069V1I
##                 <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric>
##   cg18478105    chr20  63216298      + | 0.0173053 0.0199615 0.0217486
##   cg09835024     chrX  24054523      + | 0.0326714 0.0517594 0.0346174
##   cg14361672     chr9 128701657      - | 0.8865378 0.8594833 0.8773512
##   cg01763666    chr17  82201630      - | 0.8258608 0.8342132 0.7833352
##   cg12950382    chr14 104710399      - | 0.9221174 0.9384120 0.9030496
##   cg02115394    chr13 114234693      - | 0.0870580 0.0858822 0.0855477
##   cg25813447     chrX  38801258      + | 0.4203698 0.3365862 0.3448609
##   cg07779434     chrX  14873227      + | 0.3743789 0.3703045 0.3686351
##   cg13417420    chr12  12696225      + | 0.0308294 0.0230513 0.0306978
##   cg12480843     chr8  73879050      + | 0.0257384 0.0236206 0.0288161
##               WR069V9I
##              <numeric>
##   cg18478105 0.0168954
##   cg09835024 0.0512708
##   cg14361672 0.8487040
##   cg01763666 0.8522306
##   cg12950382 0.9232085
##   cg02115394 0.0777868
##   cg25813447 0.3797780
##   cg07779434 0.3673453
##   cg13417420 0.0264349
##   cg12480843 0.0262686
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
##      width seq                                              names               
##  [1]     1 C                                                cg18478105
##  [2]     1 C                                                cg09835024
##  [3]     1 G                                                cg14361672
##  [4]     1 G                                                cg01763666
##  [5]     1 G                                                cg12950382
##  [6]     1 G                                                cg02115394
##  [7]     1 C                                                cg25813447
##  [8]     1 C                                                cg07779434
##  [9]     1 C                                                cg13417420
## [10]     1 C                                                cg12480843
ranges(epic_gr)[strand(epic_gr) == '-'] <- shift(ranges(epic_gr)[strand(epic_gr) == '-'], 1)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
##              seqnames    ranges strand |  WR025V1I  WR025V9I  WR069V1I
##                 <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric>
##   cg18478105    chr20  63216298      + | 0.0173053 0.0199615 0.0217486
##   cg09835024     chrX  24054523      + | 0.0326714 0.0517594 0.0346174
##   cg14361672     chr9 128701658      - | 0.8865378 0.8594833 0.8773512
##   cg01763666    chr17  82201631      - | 0.8258608 0.8342132 0.7833352
##   cg12950382    chr14 104710400      - | 0.9221174 0.9384120 0.9030496
##   cg02115394    chr13 114234694      - | 0.0870580 0.0858822 0.0855477
##   cg25813447     chrX  38801258      + | 0.4203698 0.3365862 0.3448609
##   cg07779434     chrX  14873227      + | 0.3743789 0.3703045 0.3686351
##   cg13417420    chr12  12696225      + | 0.0308294 0.0230513 0.0306978
##   cg12480843     chr8  73879050      + | 0.0257384 0.0236206 0.0288161
##               WR069V9I
##              <numeric>
##   cg18478105 0.0168954
##   cg09835024 0.0512708
##   cg14361672 0.8487040
##   cg01763666 0.8522306
##   cg12950382 0.9232085
##   cg02115394 0.0777868
##   cg25813447 0.3797780
##   cg07779434 0.3673453
##   cg13417420 0.0264349
##   cg12480843 0.0262686
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
##      width seq                                              names               
##  [1]     1 C                                                cg18478105
##  [2]     1 C                                                cg09835024
##  [3]     1 C                                                cg14361672
##  [4]     1 C                                                cg01763666
##  [5]     1 C                                                cg12950382
##  [6]     1 C                                                cg02115394
##  [7]     1 C                                                cg25813447
##  [8]     1 C                                                cg07779434
##  [9]     1 C                                                cg13417420
## [10]     1 C                                                cg12480843
# load per-position data from EM-seq and WGBS
cov_df <- read.delim('../04_parse_bismark_covs/rarefied.coverages.tsv.gz')
beta_df <- read.delim('../04_parse_bismark_covs/rarefied.methpcts.tsv.gz')

# convert meth % values (0-100) in `beta_df` to betas (0-1)
beta_df[4:ncol(beta_df)] <- beta_df[4:ncol(beta_df)] / 100
head(beta_df)
##   scaffold start_pos end_pos WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER
## 1     chr1     10469   10469       NaN       NaN       NaN       NaN       NaN
## 2     chr1     10470   10470       NaN       NaN       NaN       NaN       NaN
## 3     chr1     10471   10471       NaN       NaN       NaN       NaN       NaN
## 4     chr1     10472   10472       NaN       NaN       NaN       NaN       NaN
## 5     chr1     10484   10484       NaN       NaN       NaN       NaN       NaN
## 6     chr1     10485   10485       NaN       NaN       NaN       NaN       NaN
##   WR069V1WR WR069V9ER WR069V9WR
## 1   0.33333         1       1.0
## 2       NaN       NaN       1.0
## 3   0.66667         1       0.5
## 4       NaN       NaN       1.0
## 5   0.66667         1       1.0
## 6   1.00000       NaN       1.0
# filtering `beta_df` for positions with min 5 coverage in ALL 8 rarefied
# samples was too stringent: initial union of all 55m positions (covered at
# least once in at least one sample) drops to < 1m post-filtering.
#
# requiring min 3 of 4 EM-seq samples && min 3 of 4 WGBS samples produces a
# more workable set of positions, ~7.7m.
#
# get numbers that summarises coverages across EM-seq and WGBS samples
diag_message('Number of positions with min 5 coverage in 4/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) == 4))
## [16:43:38] Number of positions with min 5 coverage in 4/4 EM-seq samples: 9568385
diag_message('Number of positions with min 5 coverage in 4/4 WGBS samples: ',
             sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) == 4))
## [16:43:40] Number of positions with min 5 coverage in 4/4 WGBS samples: 3274673
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) >= 3))
## [16:43:43] Number of positions with min 5 coverage in 3/4 EM-seq samples: 27615442
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) >= 3))
## [16:43:47] Number of positions with min 5 coverage in 3/4 EM-seq samples: 12354217
# ... WGBS not performing too hot here, tsk tsk

# note: for these lines to work, positions in `beta_df` and `cov_df` must have 
#       the same order (guaranteed by upstream Python script). else `beta_df` 
#       will not be sliced properly
diag_message('Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5. ')
## [16:43:51] Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5.
diag_message('Original nrow: ', nrow(beta_df), '; ')
## [16:43:51] Original nrow: 57672689;
beta_df <- beta_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
                     rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
cov_df <- cov_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
                   rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
diag_message('filtered nrow: ', nrow(beta_df))
## [16:44:05] filtered nrow: 7744836
diag_message('Mean coverage of remaining positions are: ',
             'EM-seq ', mean(as.matrix(cov_df[grepl('ER$', colnames(cov_df))])), '; ',
             'WGBS ', mean(as.matrix(cov_df[grepl('WR$', colnames(cov_df))])))
## [16:44:05] Mean coverage of remaining positions are: EM-seq 7.59187689707051; WGBS 6.70286546416219
# again, WGBS not performing well in terms of coverage

# convert positions from `beta_df` into a GRanges object, so that intersecting
# EPIC data (already in GRanges object) is easy
beta_gr <- GRanges(paste(beta_df$scaffold, beta_df$start_pos, sep=':'))
mcols(beta_gr) <- beta_df[4:ncol(beta_df)]
head(beta_gr)
## GRanges object with 6 ranges and 8 metadata columns:
##       seqnames    ranges strand | WR025V1ER WR025V1WR WR025V9ER WR025V9WR
##          <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric> <numeric>
##   [1]     chr1     16244      * |   0.80000   0.77778   0.85714       0.6
##   [2]     chr1     17407      * |   0.88889   1.00000   1.00000       1.0
##   [3]     chr1     17452      * |   0.94118   1.00000   1.00000       1.0
##   [4]     chr1     17453      * |   0.90000   1.00000   1.00000       1.0
##   [5]     chr1     17478      * |   1.00000   1.00000   1.00000       1.0
##   [6]     chr1     17479      * |   0.92857   1.00000   0.92308       1.0
##       WR069V1ER WR069V1WR WR069V9ER WR069V9WR
##       <numeric> <numeric> <numeric> <numeric>
##   [1]   0.71429   0.83333   0.50000   0.77778
##   [2]   1.00000   1.00000   1.00000   0.83333
##   [3]   1.00000   1.00000   1.00000   1.00000
##   [4]   1.00000   1.00000   1.00000   1.00000
##   [5]   1.00000   0.92308   1.00000   1.00000
##   [6]   0.87500   1.00000   0.85714   1.00000
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths
# aggressively subset both GRanges to the intersection of both
diag_message('# positions in `epic_gr` before intersection: ', length(epic_gr))
## [16:44:29] # positions in `epic_gr` before intersection: 864755
epic_gr <- subsetByOverlaps(epic_gr, beta_gr, type='equal', ignore.strand=TRUE)
beta_gr <- subsetByOverlaps(beta_gr, epic_gr, type='equal', ignore.strand=TRUE)
epic_gr <- sort(sortSeqlevels(epic_gr), ignore.strand=TRUE)
beta_gr <- sort(sortSeqlevels(beta_gr), ignore.strand=TRUE)
diag_message('# positions in `epic_gr` after intersection: ', length(epic_gr))
## [16:44:30] # positions in `epic_gr` after intersection: 103670
# sanity check after subsetting: lengths are identical, sequence names are in
# same order, as well as start/end values
stopifnot(length(epic_gr) == length(beta_gr))
stopifnot(identical(seqnames(epic_gr), seqnames(beta_gr)))
stopifnot(identical(start(epic_gr), start(beta_gr)))
stopifnot(identical(end(epic_gr), end(beta_gr)))

# this allows for the cbinding of metadata together. store the metadata
# in `epic_gr`, which is better annotated than `beta_gr`
values(epic_gr) <- cbind(values(beta_gr), values(epic_gr))

# transform the metadata into a proper data.frame (not the 'DataFrame' S4 object
# used in GRanges), then calculate per-method mean methylation levels
wide_df <- as.data.frame(values(epic_gr))
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanI <- rowMeans(wide_df[, grepl('I$', colnames(wide_df))])
head(wide_df)
##            WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620   1.00000   0.85714   1.00000   0.80000   1.00000   1.00000
## cg02288058   1.00000   0.80000   0.85000   0.85714   0.05882   0.06154
## cg11330075   0.71429   1.00000   0.80000   0.70000   0.83333   0.60000
## cg01068023   1.00000   1.00000   1.00000   1.00000   1.00000   1.00000
## cg05440980   1.00000   1.00000   0.66667   1.00000   0.90000   0.50000
## cg25838738   1.00000   1.00000   1.00000   0.80000   1.00000   1.00000
##            WR069V9ER WR069V9WR  WR025V1I  WR025V9I  WR069V1I  WR069V9I
## cg24335620   0.92308   1.00000 0.7768391 0.7299938 0.7739380 0.7501038
## cg02288058   0.03774   0.02174 0.3777661 0.2932215 0.3655422 0.3035048
## cg11330075   0.40000   1.00000 0.1661267 0.1625537 0.1878874 0.1681365
## cg01068023   1.00000   1.00000 0.8365245 0.8375699 0.8347850 0.8662742
## cg05440980   1.00000   1.00000 0.9088851 0.8995479 0.8970510 0.9004642
## cg25838738   1.00000   1.00000 0.8704974 0.8625925 0.8639985 0.8813814
##               meanER   meanWR     meanI
## cg24335620 0.9807700 0.914285 0.7577187
## cg02288058 0.4866400 0.435105 0.3350087
## cg11330075 0.6869050 0.825000 0.1711761
## cg01068023 1.0000000 1.000000 0.8437884
## cg05440980 0.8916675 0.875000 0.9014871
## cg25838738 1.0000000 0.950000 0.8696174
# plot a PCA to visualise overall profile of datasets. note that PCA doesn't
# like dealing with NA, necessitating the use of complete.cases() to remove
# any rows with NA in them
set.seed(1337)
pca <- prcomp(t(wide_df[complete.cases(wide_df), ]), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2

pca_df <- data.frame(PC1=pca_coords$PC1,
                     PC2=pca_coords$PC2,
                     PC3=pca_coords$PC3,
                     row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*I$', 'EPIC', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|I$', '', rownames(pca_df))

g1 <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
  geom_point(size=3, alpha=0.5) +
  scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_shape_manual(values=21:25) +
  xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
  ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
  theme_minimal(12) +
  theme(legend.position='top', legend.box='vertical', legend.margin=margin())
g2 <- ggplot(pca_df, aes(x=PC2, y=PC3, color=method, fill=method, shape=sample)) +
  geom_point(size=3, alpha=0.5) +
  scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_shape_manual(values=21:25) +
  xlab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
  ylab(paste0('PC3 (', round(eigs[3] / sum(eigs) * 100, 2), '%)')) +
  theme_minimal(12) +
  theme(legend.position='none', legend.box='vertical', legend.margin=margin())
plot_grid(g1, g2, ncol=1, align='v', rel_heights=c(0.55, 0.45))

ggsave('three-way.beta.pca.pdf', width=6, height=6)
# PCA indicates variation is largest across methods (specifically EPIC vs.
# the other two short read methods). There is some variation across samples,
# seen in PC3 (approx PC2 in terms of % variation explained). In PC3, the
# four-sided shapes are on top (WR025) while the three-sided ones are bottom
# (WR069). However, most variation still originate from choice of method,
# hence the calculation of per-method means; subsequent plots are based off
# these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position beta values, EM-seq vs. WGBS') +
  xlab('EM-seq') +
  ylab('WGBS') +
  theme_minimal(12)

# plot EM-seq vs. EPIC
ggplot(wide_df, aes(x=meanER, y=meanI)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
  ggtitle('Per-position beta values, EM-seq vs. EPIC') +
  xlab('EM-seq') +
  ylab('EPIC') +
  theme_minimal(12)

# WGBS vs. EPIC
ggplot(wide_df, aes(x=meanWR, y=meanI)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
  ggtitle('Per-position beta values, WGBS vs. EPIC') +
  xlab('WGBS') +
  ylab('EPIC') +
  theme_minimal(12)

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
#
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101  # this value should be odd, so that the base at midpoint
                  # is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2

# letterFrequency(CG) / letterfrequency(ACGT) deals with edge cases where windows
# contain N or non-ACGT bases, which get ignored
window_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, epic_gr + bp_per_side)
wide_df$gcpct <- as.vector(
  letterFrequency(window_seqs, 'CG') / letterFrequency(window_seqs, 'ACGT') * 100)
head(wide_df)
##            WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620   1.00000   0.85714   1.00000   0.80000   1.00000   1.00000
## cg02288058   1.00000   0.80000   0.85000   0.85714   0.05882   0.06154
## cg11330075   0.71429   1.00000   0.80000   0.70000   0.83333   0.60000
## cg01068023   1.00000   1.00000   1.00000   1.00000   1.00000   1.00000
## cg05440980   1.00000   1.00000   0.66667   1.00000   0.90000   0.50000
## cg25838738   1.00000   1.00000   1.00000   0.80000   1.00000   1.00000
##            WR069V9ER WR069V9WR  WR025V1I  WR025V9I  WR069V1I  WR069V9I
## cg24335620   0.92308   1.00000 0.7768391 0.7299938 0.7739380 0.7501038
## cg02288058   0.03774   0.02174 0.3777661 0.2932215 0.3655422 0.3035048
## cg11330075   0.40000   1.00000 0.1661267 0.1625537 0.1878874 0.1681365
## cg01068023   1.00000   1.00000 0.8365245 0.8375699 0.8347850 0.8662742
## cg05440980   1.00000   1.00000 0.9088851 0.8995479 0.8970510 0.9004642
## cg25838738   1.00000   1.00000 0.8704974 0.8625925 0.8639985 0.8813814
##               meanER   meanWR     meanI    gcpct
## cg24335620 0.9807700 0.914285 0.7577187 64.35644
## cg02288058 0.4866400 0.435105 0.3350087 32.67327
## cg11330075 0.6869050 0.825000 0.1711761 49.50495
## cg01068023 1.0000000 1.000000 0.8437884 49.50495
## cg05440980 0.8916675 0.875000 0.9014871 46.53465
## cg25838738 1.0000000 0.950000 0.8696174 40.59406
# check distribution of GC% values, to select a colour scheme for the heatmap
# centered appropriately over the median
quantile(wide_df$gcpct, c(0.05, .1, .5, .9, .95))
##       5%      10%      50%      90%      95% 
## 32.67327 34.65347 46.53465 60.39604 63.36634
# hmm. human genome GC% is 42%, but perhaps EPIC probes are preferentially
# chosen around CpG islands, bumping the GC% up slightly. choose a [30, 70]
# colour scale for subsequent plots
# 
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position beta values') +
  xlab('EM-seq') +
  ylab('WGBS') +
  theme_minimal(12) +
  theme(legend.position=c(0.75, 0.1))

# plot EM-seq vs. EPIC and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanI, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('EM-seq') +
  ylab('EPIC') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# plot WGBS vs. EPIC and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanI, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('WGBS') +
  ylab('EPIC') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
## 
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62561 -0.04527  0.00354  0.04517  0.53147 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -3.373e-03  1.437e-03  -2.347   0.0189 *
## wide_df$gcpct -3.291e-06  2.998e-05  -0.110   0.9126  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09231 on 103668 degrees of freedom
## Multiple R-squared:  1.162e-07,  Adjusted R-squared:  -9.53e-06 
## F-statistic: 0.01204 on 1 and 103668 DF,  p-value: 0.9126
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
  ggtitle('Per-position residual beta values vs. GC%') +
  xlab('GC%') +
  ylab('Residual WGBS - EM-seq') +
  theme_minimal(12)

wide_df$delta_ont_emseq <- wide_df$meanI - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual EPIC - EM-seq') +
  theme_minimal(12)

wide_df$delta_ont_wgbs <- wide_df$meanI - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual EPIC - WGBS') +
  theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
          labels=c('A', 'B', 'C', 'D', 'E', 'F'))
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)

# sanity check: confirm that scatterplots, whilst slightly overplotted, are
# visually accurate--replots of these with fewer points (10k) should show
# similar trends
set.seed(42)
wide_df_10k <- wide_df[sample(nrow(wide_df), 10000), ]
g3_10k <- ggplot(wide_df_10k, aes(x=meanER, y=meanWR, color=gcpct)) +
  geom_point(size=1, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df_10k, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position methylation levels') +
  xlab('EM-seq (%)') +
  ylab('WGBS (%)') +
  theme_minimal(12) +
  theme(legend.position=c(0.75, 0.1))

# plot EM-seq vs. EPIC and colour points by GC%
g4_10k <- ggplot(wide_df_10k, aes(x=meanER, y=meanI, color=gcpct)) +
  geom_point(size=1, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df_10k, 'meanER', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('EM-seq (%)') +
  ylab('EPIC (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# plot WGBS vs. EPIC and colour points by GC%
g5_10k <- ggplot(wide_df_10k, aes(x=meanWR, y=meanI, color=gcpct)) +
  geom_point(size=1, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df_10k, 'meanWR', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('WGBS (%)') +
  ylab('EPIC (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

plot_grid(g3, g3_10k, g4, g4_10k, g5, g5_10k, ncol=2, rel_heights=c(1, 1, 1),
          labels=c('A', 'B', 'C', 'D', 'E', 'F'))
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

# ... uh, looks ok to me?


# hmm. there's some interesting trends in the high GC% region. how many positions
# have high GC% in its immediate context?
diag_message('Positions with GC > 70% in immediate context: ', sum(wide_df$gcpct > 70))
## [16:45:11] Positions with GC > 70% in immediate context: 1226
diag_message('Positions with GC > 75% in immediate context: ', sum(wide_df$gcpct > 75))
## [16:45:11] Positions with GC > 75% in immediate context: 235
# don't think analyses using these collection of points would be convincing...

# list deps used in this script
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3                ggplot2_3.4.0                    
##  [3] cowplot_1.1.1                     BSgenome.Hsapiens.UCSC.hg38_1.4.4
##  [5] BSgenome_1.64.0                   rtracklayer_1.56.1               
##  [7] GenomicRanges_1.48.0              Biostrings_2.64.1                
##  [9] GenomeInfoDb_1.32.4               XVector_0.36.0                   
## [11] IRanges_2.30.1                    S4Vectors_0.34.0                 
## [13] BiocGenerics_0.42.0              
## 
## loaded via a namespace (and not attached):
##  [1] MatrixGenerics_1.8.1        Biobase_2.56.0             
##  [3] sass_0.4.4                  jsonlite_1.8.3             
##  [5] splines_4.2.2               bslib_0.4.1                
##  [7] assertthat_0.2.1            highr_0.9                  
##  [9] GenomeInfoDbData_1.2.8      Rsamtools_2.12.0           
## [11] yaml_2.3.6                  pillar_1.8.1               
## [13] lattice_0.20-45             glue_1.6.2                 
## [15] digest_0.6.30               colorspace_2.0-3           
## [17] htmltools_0.5.3             Matrix_1.5-3               
## [19] XML_3.99-0.12               pkgconfig_2.0.3            
## [21] zlibbioc_1.42.0             scales_1.2.1               
## [23] BiocParallel_1.30.4         tibble_3.1.8               
## [25] mgcv_1.8-41                 generics_0.1.3             
## [27] farver_2.1.1                cachem_1.0.6               
## [29] withr_2.5.0                 SummarizedExperiment_1.26.1
## [31] cli_3.4.1                   magrittr_2.0.3             
## [33] crayon_1.5.2                evaluate_0.18              
## [35] fansi_1.0.3                 nlme_3.1-160               
## [37] textshaping_0.3.6           tools_4.2.2                
## [39] BiocIO_1.6.0                lifecycle_1.0.3            
## [41] matrixStats_0.63.0          stringr_1.4.1              
## [43] munsell_0.5.0               DelayedArray_0.22.0        
## [45] compiler_4.2.2              jquerylib_0.1.4            
## [47] systemfonts_1.0.4           rlang_1.0.6                
## [49] grid_4.2.2                  RCurl_1.98-1.9             
## [51] rjson_0.2.21                bitops_1.0-7               
## [53] labeling_0.4.2              rmarkdown_2.18             
## [55] restfulr_0.0.15             gtable_0.3.1               
## [57] codetools_0.2-18            DBI_1.1.3                  
## [59] R6_2.5.1                    GenomicAlignments_1.32.1   
## [61] knitr_1.41                  dplyr_1.0.10               
## [63] fastmap_1.1.0               utf8_1.2.2                 
## [65] ragg_1.2.4                  stringi_1.7.8              
## [67] parallel_4.2.2              vctrs_0.5.1                
## [69] tidyselect_1.2.0            xfun_0.35